wine_rating = read_csv("./wine_rating.csv")

Wendy’s Part

Wine Price by Country

Average Wine Price by Country

price_country=
  wine_rating |>
  group_by(country)|>
  summarise(avg_by_country = mean(price)) |>
  filter(!is.na(avg_by_country))

World Map of Average Wine Price by Country

world_map_data <- 
  price_country |>
  mutate(text = paste(country, "<br>Avg Price: $", round(avg_by_country, 2)))

fig_country_all <- plot_ly(
  data = world_map_data,
  type = "choropleth",
  locations = ~country,
  locationmode = "country names",
  z = ~avg_by_country,
  text = ~text,
  colorscale = "Viridis"
)

fig_country_all <- 
  fig_country_all|>
  layout(
    geo = list(
      showframe = FALSE,
      projection = list(type = 'mercator')
    ),
    title = "Average Wine Price by Country"
  )

fig_country_all

World Map of The 10 Most Expensive Country

price_country <- 
  price_country |>
  arrange(desc(avg_by_country))

top_10_country = 
  head(price_country, 10) |> 
  mutate(rank = 1:10) |> 
  transform(rank = 1:10)

country10_map_data <- top_10_country |>
  mutate(text = paste("Rank: ", rank, "<br>Country: ", country, "<br>Avg Price: $", round(avg_by_country, 2)))

fig_country_10map <- plot_ly(
  data = country10_map_data,
  type = "choropleth",
  locations = ~country,
  locationmode = "country names",
  z = ~avg_by_country,
  text = ~text,
  colorscale = "YlGnBu"
)

fig_country_10map <- 
  fig_country_10map |>
  layout(
    geo = list(
      showframe = FALSE,
      projection = list(type = 'mercator')
    ),
    title = "Top 10 Countries Based on Average Wine Price"
  )

fig_country_10map

Wine Price by Region

price_region=
  wine_rating |>
  group_by(country,region)|>
  summarise(avg_by_region = mean(price)) |>
  filter(!is.na(avg_by_region))|>
  arrange(desc(avg_by_region))

Bar Plot of The 10 Most Expensive Region

top_10_regions <- head(price_region, 10)

fig_region10 <- plot_ly(
  data = top_10_regions,
  type = "bar",
  x = ~reorder(region, -avg_by_region),
  y = ~avg_by_region,
  color = ~country,
  text = ~paste("Country: ", country, "<br>Avg Price: $", round(avg_by_region, 2)),
  marker = list(size = 10)
)

fig_region10 <- 
  fig_region10 |>
  layout(
    title = "Top 10 Regions Based on Average Wine Price",
    xaxis = list(title = "Region"),
    yaxis = list(title = "Average Wine Price"),
    showlegend = TRUE
  )

fig_region10

Wine Price by Winery

price_winery=
  wine_rating |>
  group_by(country,region, winery)|>
  summarise(avg_by_winery = mean(price)) |>
  filter(!is.na(avg_by_winery))|>
  arrange(desc(avg_by_winery))

Distribution for Top 10 Winery

price_winery_dis <- wine_rating %>%
  group_by(country, region, winery) %>%
  mutate(avg_by_winery = mean(price)) %>%
  filter(!is.na(avg_by_winery)) %>%
  select(country, region, winery, price, avg_by_winery) %>%
  arrange(desc(avg_by_winery))

top_10_wineries <- price_winery_dis %>%
  group_by(avg_by_winery) %>%
  nest() %>%
  arrange(desc(avg_by_winery)) %>%
  head(10) %>%
  unnest(cols = data)

# Create a Plotly boxplot with color-coded markers by region
boxplot_plotly <- plot_ly(
  data = top_10_wineries,
  type = "box",
  x = ~winery,
  y = ~price,
  color = ~region,   # Color by region
  colors = "Set3"    # Choose a color scale (Set3 provides clear distinctions)
)

# Customize layout
boxplot_plotly <- boxplot_plotly %>%
  layout(
    title = "Price Distribution for Top 10 Wineries",
    xaxis = list(title = "Winery"),
    yaxis = list(title = "Price"),
    showlegend = TRUE   # Display legend for region colors
  )

# Display the Plotly plot
boxplot_plotly

Wine Price Distribution

filtered_price <- 
  wine_rating |>
  select(price)|>
  filter(!is.na(price), price > 0)

summary(filtered_price)
##      price         
##  Min.   :   3.150  
##  1st Qu.:   9.902  
##  Median :  15.950  
##  Mean   :  33.025  
##  3rd Qu.:  32.500  
##  Max.   :3410.790

Log-Transformed Price Distribution

histogram_plotly_log <- plot_ly(
  data = filtered_price,
  type = "histogram",
  x = ~log(price),
  nbinsx = 20,  # Adjust the number of bins as needed
  marker = list(color = "skyblue")
)

histogram_plotly_log <- histogram_plotly_log |>
  layout(
    title = "Log-Transformed Price Distribution",
    xaxis = list(title = "Log(Price)"),
    yaxis = list(title = "Frequency")
  )
histogram_plotly_log

Wine Price in the United States

price_us=
  wine_rating |>
  filter(country=="United States")|>
  select(country,region, winery,price)

price_us_200=
  price_us|>
  filter(price<200)

# Create a histogram
histogram_plotly_us <- plot_ly(
  data = price_us_200,
  type = "histogram",
  x = ~price,
  nbinsx = 10,  # Adjust the number of bins as needed
  marker = list(color = "lightblue")
)

# Customize layout
histogram_plotly_us <- histogram_plotly_us |>
  layout(
    title = "Wine Price Distribution in the US (Filtered under 200)",
    xaxis = list(title = "Price"),
    yaxis = list(title = "Frequency")
  )

# Display the histogram
histogram_plotly_us

Allen’s Part

Log transformed price verses rating

ggplot_rp_tf = wine_rating |> 
  ggplot(aes(x = log(price), y = rating, color = year)) +
  geom_point()
ggplotly(ggplot_rp_tf)

Fitting linear model to rating, price, and year(residual plot)

lm_rp = lm(rating ~ log(price) + year, data = wine_rating)
resid_rp = wine_rating |> 
  modelr::add_residuals(lm_rp) |> 
  ggplot(aes(x = log(price), y = resid)) + geom_point()
ggplotly(resid_rp)

Performing Bootstrap for above model for further test

bootstrap_rp = wine_rating |> 
  modelr::bootstrap(n = 1000) |> 
  mutate(
    models = map(strap, \(df) lm(rating ~ log(price) + year, data = df)),
    results = map(models, broom::tidy)) |> 
  select(results) |> 
  unnest(results) |> 
  filter(term == "log(price)") |> 
  ggplot(aes(x = estimate)) + geom_density()
ggplotly(bootstrap_rp)

Wenwen’s Part

Wine ratings Visualizations.

#remove missing values and any duplicates
wine_rating = 
  wine_rating |>
  na.omit() |>
  distinct() |> 
  mutate(name = gsub("\\d", "", winery), year = as.numeric(year),
         name = iconv(name, from = "", to = "UTF-8", sub = ""),
         winery = iconv(winery, from = "", to = "UTF-8", sub = ""),
         region = iconv(region, from = "", to = "UTF-8", sub = ""))

Relationship between Wine prices and rating for different categories.

# Plotly scatter plot with loess smoothing
plot_ly(wine_rating, x = ~price, y = ~rating, type = 'scatter', mode = 'markers', color = ~categories) %>%
  layout(title = "Average wine price by category and production region",
         xaxis = list(title = "Price"),
         yaxis = list(title = "Rating"))

This scatterplot with loess smoothing generally indicates that higher priced wines have higher ratings for all the four categories of wines.

Within countries, highest rating regions

# #Within Countries, highest rating regions
wine_rating_summary <- wine_rating %>%
  group_by(country, region) %>%
  filter(number_of_ratings > 2000) %>%
  summarise(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n()) %>%
  ungroup() %>%
  arrange(region, desc(mean_rating)) %>%
  top_n(20)

# Plotly bar plot
plot_ly(wine_rating_summary, x = ~reorder(region, -mean_rating), y = ~mean_rating, type = 'bar', color = ~country) %>%
  layout(title = "Average wine rating by region (Within countries)",
         xaxis = list(title = "Region"),
         yaxis = list(title = "Mean Rating"))

Italy by far tops the list of the countries with the highest rated wine producing regions. Out of the top 27 regions in terms of mean-rating, almost half (13) of the wine producing regions are from Italy. The graph only shows the regions from which their wines received more than 2000 ratings.

Rating analysis by category in a specific region (e.g., Napa Valley):

# Data Preparation
rating_analysis_data <- wine_rating |>
  filter(region %in% c("Napa Valley", "Napa County", "California")) |>
  filter(categories != "Rose") |> 
  group_by(region, categories) |>
  summarise(mean_rating = mean(rating)) |>
  spread(key = categories, value = mean_rating)

# Convert to matrix (required for heatmap)
rating_matrix <- as.matrix(rating_analysis_data[,-1])
rownames(rating_matrix) <- rating_analysis_data$region

# Plotly Heatmap
fig <- plot_ly(x = colnames(rating_matrix), 
               y = rownames(rating_matrix), 
               z = rating_matrix, 
               type = "heatmap",
               colorscale = "Viridis") %>%
  layout(title = 'Rating Analysis by Wine Category in Napa Valley, Napa County, and California',
         xaxis = list(title = 'Wine Category'),
         yaxis = list(title = 'Region'))

fig

Narrowing down to the ratings of individual regions and the wines that they produce, this graph compares the mean-ratings of the wines from California, Napa County and Napa Valley. From the graph, the Red wines from Napa Valley, had the highest mean-ratings compared to the other two, just as the White category of wines from Napa County had the highest mean rating. Of the three regions considered, only California produced the Rose category of wines, and incidentally had the lowest mean-rating of the categories.

Most Welcoming Wine Valleys

# Filter data if needed
welcoming_valleys_data <- wine_rating |>
  filter(number_of_ratings > 2000) |>
  group_by(region, categories) |>
  summarise(mean_rating = mean(rating), 
            number_of_ratings = number_of_ratings,
            mean_price = mean(price)) |>
  ungroup() |>
  filter(mean_rating > 4.3)

# Create a Plotly Bubble Chart
fig <- plot_ly(data = welcoming_valleys_data, 
               x = ~mean_price, 
               y = ~mean_rating, 
               size = ~number_of_ratings, 
               color = ~region,
               colorscale = "Viridis",
               type = 'scatter', 
               mode = 'markers', 
               marker = list(sizemode = 'diameter', sizeref = 0.7, opacity = 0.4)) %>%
  layout(title = 'Most Welcoming Wine Valleys by Wine Category',
         xaxis = list(title = 'Mean Price'),
         yaxis = list(title = 'Mean Rating'),
         colorway = list('region'))

fig

This bubble plot shows the relationship between the mean-prices and mean-ratings of wines, for only the wines that had more than 2000 ratings, and whose mean-rating was greater than 4.3 all grouped by the wine category. The sizes of the bubbles indicate the number of ratings, while the colors indicate the regions. From the plot, only the wine category of wine makes this cut. Again, only seven wine producing regions make this cut, four of them are from Italy.

Ultraman’s Part

all_wine_cate = wine_rating |> 
  filter(categories != "Varieties") |> 
  group_by(categories) |> 
  summarise(number = n())
all_wine_cate
## # A tibble: 4 × 2
##   categories number
##   <chr>       <int>
## 1 Red          8666
## 2 Rose          397
## 3 Sparkling    1007
## 4 White        3764
price_plot= wine_rating |> 
  plot_ly(y = ~price, x= ~categories, color = ~ categories, type = "box", colors = "viridis") |> 
  layout(yaxis = list(range = c(0, 1500),dtick = 50))
 price_plot
rating_plot= wine_rating |> 
  plot_ly(y = ~rating, x= ~categories, color = ~ categories, type = "box", colors = "viridis") |> 
  layout(yaxis = list(range = c(0, 5), dtick = 0.2))
 rating_plot